{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "import json\n",
    "import itertools\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "{\n",
    "  \"asin\": \"B000050B6Z\",\n",
    "  \"questionType\": \"yes/no\",\n",
    "  \"answerType\": \"Y\",\n",
    "  \"answerTime\": \"Aug 8, 2014\",\n",
    "  \"unixTime\": 1407481200,\n",
    "  \"question\": \"Can you use this unit with GEL shaving cans?\",\n",
    "  \"answer\": \"Yes. If the can fits in the machine it will despense hot gel lather. I've been using my machine for both , gel and traditional lather for over 10 years.\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['BEGIN',\n",
       " 'the',\n",
       " 'clouds',\n",
       " 'are',\n",
       " 'in',\n",
       " 'the',\n",
       " 'sky',\n",
       " '.',\n",
       " 'END',\n",
       " 'PAD',\n",
       " 'PAD',\n",
       " 'PAD',\n",
       " 'PAD',\n",
       " 'PAD',\n",
       " 'PAD']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "start_token = \"BEGIN\"\n",
    "end_token = \"END\"\n",
    "pad_token = \"PAD\"\n",
    "unknow_token = \"UNKNOW\"\n",
    "\n",
    "with open(\"Downloads/qa_Appliances_quote.json\") as json_file:\n",
    "    json_data = json.load(json_file)\n",
    "\n",
    "question_sent = [x['question'].lower() for x in json_data]\n",
    "tokenized_cent = [nltk.word_tokenize(x)[:13] for x in question_sent]\n",
    "for i, cent in enumerate(tokenized_cent):\n",
    "    cent.append(end_token)\n",
    "    cent.insert(0, start_token)\n",
    "    while(len(cent)<15):\n",
    "        cent.append(pad_token)\n",
    "tokenized_cent[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1831"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_freq = nltk.FreqDist(itertools.chain(*tokenized_cent)) # 2327\n",
    "len(word_freq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "vocabulary_size = 1800\n",
    "vocab = word_freq.most_common(vocabulary_size)\n",
    "index_2_word = dict([(i, w[0]) for i,w in enumerate(vocab)])\n",
    "index_2_word[vocabulary_size] = unknow_token\n",
    "\n",
    "word_2_index = dict([(index_2_word[i], i) for i,w in enumerate(index_2_word)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i, sent in enumerate(tokenized_cent):\n",
    "    tokenized_cent[i] = [w if w in word_2_index else unknow_token for w in sent]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_pre_train = np.asarray([[word_2_index[w] for w in sent[:-1]] for sent in tokenized_cent[:800]])\n",
    "X_pre_text = np.asarray([[word_2_index[w] for w in sent[:-1]] for sent in tokenized_cent[800:1000]])\n",
    "Y_pre_train = np.asarray([[word_2_index[w] for w in sent[1:]] for sent in tokenized_cent[:800]])\n",
    "Y_pre_test = np.asarray([[word_2_index[w] for w in sent[1:]] for sent in tokenized_cent[800:1000]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1801"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_pre_train.shape\n",
    "len(word_2_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train = np.eye(vocabulary_size+1)[X_pre_train]\n",
    "X_test = np.eye(vocabulary_size+1)[X_pre_text]\n",
    "Y_train = np.eye(vocabulary_size+1)[Y_pre_train]\n",
    "Y_test = np.eye(vocabulary_size+1)[Y_pre_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       ..., \n",
       "       [ 0.,  0.,  1., ...,  0.,  0.,  0.],\n",
       "       [ 1.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 1.,  0.,  0., ...,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1801"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class SmapleRNN:\n",
    "    def __init__(self, input_dim, hidden_dim=50):\n",
    "        # 设置RNN网络的输入、输出、隐层的维度\n",
    "        self.input_dim = input_dim\n",
    "        self.output_dim = input_dim\n",
    "        self.hidden_dim = hidden_dim\n",
    "        \n",
    "        # 定义RNN网络权重大小\n",
    "        U_size = (self.hidden_dim, self.input_dim)\n",
    "        W_size = (self.hidden_dim, self.hidden_dim)\n",
    "        V_size = (self.input_dim, self.hidden_dim)\n",
    "        \n",
    "        # 随机初始化RNN网络权重参数\n",
    "        random_min = -np.sqrt(1./self.input_dim)\n",
    "        random_max = np.sqrt(1./self.input_dim)\n",
    "        self.U = np.random.uniform(random_min, random_max, U_size)\n",
    "        self.W = np.random.uniform(random_min, random_max, W_size)\n",
    "        self.V = np.random.uniform(random_min, random_max, V_size)\n",
    "        \n",
    "    def _softmax(self, x):\n",
    "        \"\"\"Compute softmax values for each sets of scores in x.\"\"\"\n",
    "        e_x = np.exp(x - np.max(x))\n",
    "        return e_x / e_x.sum(axis=0)\n",
    "    \n",
    "    def _tanh(self, x):\n",
    "        \"\"\"Sample translate the input data to numpy tanh function.\"\"\"\n",
    "        return np.tanh(x)\n",
    "    \n",
    "    def forward_propagation(self, x):\n",
    "        \"\"\"向前传播算法\"\"\"\n",
    "        time = x.shape[0]    # 获取输入数据中的序列数time\n",
    "\n",
    "        s = np.zeros((time+1, self.hidden_dim)) # (15, 50) 定义状态矩阵\n",
    "        h = np.zeros((time, self.input_dim))  # (14, 1801) 定义输出\n",
    "        \n",
    "        # 根据公式计算隐层状态  和输出  的值\n",
    "        for t in np.arange(time):\n",
    "            s[t] = self._tanh(np.dot(self.U, x[t]) + np.dot(self.W, s[t-1]))\n",
    "            h[t] = self._softmax(self.V.dot(s[t]))\n",
    "\n",
    "        return (s, h)\n",
    "    \n",
    "    def predict(self, x):\n",
    "        \"\"\"根据输入序列数据预测序列输出\"\"\"\n",
    "        s, h = self.forward_propagation(x)\n",
    "        \n",
    "        print(\"input shape:{}.\".format(x.shape))    #>>> input shape:(14, 1801).\n",
    "        print(\"status shape:{}.\".format(s.shape))    #>>> status shape:(15, 50).\n",
    "        print(\"output shape:{}.\".format(h.shape))    #>>> output shape:(14, 1801).\n",
    "        \n",
    "        return np.argmax(h, axis=1)\n",
    "    \n",
    "    def calc_loss(self, x, y):\n",
    "        \"\"\"计算损失\"\"\" \n",
    "        loss = 0\n",
    "        time = y.shape[0]\n",
    "        \n",
    "        s, y_predict = self.forward_propagation(x)\n",
    "\n",
    "        # t时间上的损失\n",
    "        loss_t = np.sum(y*(np.log(y_predict)), axis=1)\n",
    "        \n",
    "        # 对单个时间序列上的损失求和，然后求平均\n",
    "        total_loss = - np.sum(loss_t) / time\n",
    "        \n",
    "        return total_loss\n",
    "    \n",
    "    def backward_propagation_though_time(self, x, y):\n",
    "        bptt_truncate = 4\n",
    "        \n",
    "        time = y.shape[0]\n",
    "        \n",
    "        s, y_predict = SampleRNN.forward_propagation(x)\n",
    "        \n",
    "        dE_dV = np.zeros_like(self.V)\n",
    "        dE_dW = np.zeros_like(self.W)\n",
    "        dE_dU = np.zeros_like(self.U)\n",
    "        \n",
    "        dE_dy = y - y_predict\n",
    "        \n",
    "        for t in reversed(range(time)):\n",
    "            dE_dV += np.outer(dE_dy[t], s[t].T)\n",
    "            \n",
    "            # 首先计算 delta t，在第一次计算的时候 t = 3 \n",
    "            delta_k = (self.V * dE_dy) * (1 - pow(s[t], 2))\n",
    "            \n",
    "            # 开始BPTT步骤 \n",
    "            for step_t in reversed(arange(t - bptt_truncate, t + 1)):\n",
    "                dE_dW += np.outer(delta_k, s[step_t-1]) # 加到之前每一步的梯度上\n",
    "                dE_dU[:, x[step_t]] += delta_k\n",
    "                delta_k = (self.W * delta_k) * (1 - pow(s[step_t-1], 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input shape:(14, 1801).\n",
      "status shape:(15, 50).\n",
      "output shape:(14, 1801).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "7.4957269394002211"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rnn_model = SmapleRNN(vocabulary_size + 1)\n",
    "y_predict = rnn_model.predict(X_train[100])\n",
    "rnn_model.calc_loss(X_train[100], Y_train[100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "y_predict.shape\n",
    "y_words = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'oven main 89q ceilings cooktop missing 390 website ps890565 space space space space space'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_predict_sent = [index_2_word[i] for i in y_predict]\n",
    "y_predict_sent = ' '.join(y_predict_sent)\n",
    "y_predict_sent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14\n",
      "13\n",
      "12\n",
      "11\n",
      "10\n"
     ]
    }
   ],
   "source": [
    "for a in np.arange(max(0, 14-4), 14+1)[::-1]:\n",
    "    print(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  1, 258, 881, 882,  66,  83, 883,   3,   2,   0,   0,   0,   0,   0])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train[100].shape\n",
    "np.argmax(X_train[100], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "y = Y_train[100]\n",
    "np.argmax(Y_train[100], axis=1)\n",
    "T=len(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "104.93758721437959"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "14*np.log(1800)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gradients_clipping(g, th = 0.8):\n",
    "    \"\"\"梯度截断：根据给定的阈值在每一次求得导数后进行判别\"\"\"\n",
    "    if( g > th ):\n",
    "        g = th/abs(g) * g # abs(g) 取导数的绝对值\n",
    "    return g"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
